package com.splider.toutiao;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.splider.toutiao.mapper.ArticleMapper;
import com.splider.toutiao.mapper.AuthorMaper;

/**
 * Crawler that polls the Toutiao home feed, records every author it has not
 * seen before, and stores that author's articles whose read count is at least
 * 10000.
 *
 * <p>All state is kept in static fields and the shared {@link SimpleDateFormat}
 * is not thread-safe, so this class is only meant to be driven from a single
 * thread (as {@link #main(String[])} does).
 */
public class TouTiaoSplider {
	
	// Toutiao home feed API endpoint
	public static final String TOUTIAOURL = "http://www.toutiao.com/api/article/feed/?category=__all__&utm_source=toutiao&widen=1";
	// Toutiao author ("toutiaohao") article list API endpoint
	public static final String TOUTIAOHAOURL = "http://www.toutiao.com/pgc/ma/?page_type=1&count=10&version=2&platform=pc";
	
	// Assigned in main(); execute() relies on it being set.
	private static SqlSessionFactory sqlSessionFactory = null;
	
	// Format of the "datetime" field in the author article list.
	private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
	
	// Number of feed refreshes performed so far (only used for logging).
	private static int refreshCount = 0;
	
	/**
	 * Program entry point: obtains the MyBatis session factory and starts
	 * crawling from the newest feed page.
	 */
	public static void main(String[] args) {
		System.out.println("----------开始干活！-----------------");
		sqlSessionFactory = MyBatisUtils.getSqlSessionFactory();
		execute(0);
	}
	
	/**
	 * Crawls the home feed page by page, starting at the given cursor.
	 *
	 * <p>While the feed reports more data, each page's authors are processed
	 * and the cursor from the response's {@code next.max_behot_time} is used for
	 * the following request. When the feed reports no more data (or gives no
	 * cursor), the crawler sleeps for one hour and starts again from cursor 0.
	 * The method only returns when a feed request fails.
	 *
	 * <p>Implemented as a loop rather than by self-recursion so that a
	 * long-running crawl cannot overflow the stack.
	 *
	 * @param hottime value sent as {@code max_behot_time} for the first request
	 */
	public static void execute(long hottime){
		long cursor = hottime;
		while(true){
			refreshCount++;
			System.out.println("----------第"+refreshCount+"次刷新----------");
			String url = TOUTIAOURL + "&max_behot_time="+cursor+"&max_behot_time_tmp="+cursor;
			JSONObject param = CommonUtils.getUrlParam();
			url+= "&as=" + param.get("as") + "&cp=" + param.get("cp");
			JSONObject json = getReturnJson(url);
			if(json == null){
				// The request failed (already logged by getReturnJson): stop crawling.
				return;
			}
			// A page without a "next" cursor cannot be continued, so it is handled like "no more data".
			JSONObject next = json.getBooleanValue("has_more") ? json.getJSONObject("next") : null;
			if(next != null){
				JSONArray data = json.getJSONArray("data");
				int size = data == null ? 0 : data.size();
				for(int i=0;i<size;i++){
					try{
						crawlAuthor(data.getJSONObject(i));
					}catch (Exception e) {
						// One bad feed entry must not abort the rest of the page.
						System.out.println("error:"+e.getMessage());
					}
				}
				cursor = next.getLongValue("max_behot_time");
				continue;
			}
			System.out.println("----------查询不到任何文章了，睡一个小时再抓----------");
			try {
				Thread.sleep(1000*60*60);
				System.out.println("----------睡醒了，开始抓----------");
			} catch (InterruptedException e) {
				// An interrupt is used here as a "wake up early" signal, so the flag is
				// deliberately not restored: restoring it would make the next sleep
				// fail immediately.
				System.out.println("@_@被吵醒了，接着再抓吧@_@");
			}
			cursor = 0;
		}
	}
	
	/**
	 * Stores the author of one feed entry if it is not in the database yet and
	 * then crawls that author's article list.
	 *
	 * <p>The session is always closed; it is committed only when the whole
	 * author crawl finished without an exception.
	 *
	 * @param obj one element of the feed's {@code data} array
	 */
	private static void crawlAuthor(JSONObject obj){
		String author = obj.getString("source");// author (toutiaohao) name
		String authorUrl = obj.getString("media_url");// author home page url
		if(!StringUtils.isNotEmpty(author) || !StringUtils.isNotEmpty(authorUrl)){
			return;
		}
		Map<String, Object> authorMap = new HashMap<String, Object>();
		authorMap.put("id",0);
		authorMap.put("title", author);
		SqlSession sqlSession = sqlSessionFactory.openSession();
		try{
			AuthorMaper authorMaper = sqlSession.getMapper(AuthorMaper.class);
			Map<String, Object> existing = authorMaper.selectAuthor(authorMap);
			if(existing!=null&&existing.size()>0){
				// Author already known: nothing to do.
				return;
			}
			// NOTE(review): presumably insertAuthor writes the generated key back into
			// authorMap under "id" (it is read right below) - confirm in the mapper.
			authorMaper.insertAuthor(authorMap);
			System.out.println("访问头条号："+author+",url:"+authorUrl);
			connectSubPage(sqlSession,authorMap.get("id").toString(),authorUrl,0L);
			sqlSession.commit();
		}finally{
			sqlSession.close();
		}
	}
	
	/**
	 * Downloads the given URL and parses the response body as JSON.
	 *
	 * @param url address to request
	 * @return the parsed JSON object, or {@code null} if the request or the
	 *         parsing failed (the failure is logged to stderr)
	 */
	public static JSONObject getReturnJson(String url){
		try{
			return fetchJson(url);
		}catch (Exception e) {
			System.err.println("访问失败:"+url);
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Reads the whole body behind {@code url} as UTF-8 text (line breaks are
	 * dropped) and parses it as a JSON object. The reader is closed even when
	 * reading or parsing fails.
	 *
	 * @throws IOException if the URL cannot be opened or read
	 */
	private static JSONObject fetchJson(String url) throws IOException{
		BufferedReader in = new BufferedReader( new InputStreamReader(new URL(url).openStream(),"UTF-8") );
		try{
			StringBuilder content = new StringBuilder();
			String line;
			while ((line = in.readLine())!=null) {
				content.append(line);
			}
			return JSONObject.parseObject(content.toString());
		}finally{
			in.close();
		}
	}
	
	/**
	 * Walks through an author's article list page by page and stores the
	 * qualifying articles using the caller's session (no commit happens here).
	 *
	 * <p>Paging stops when a page cannot be loaded, when {@code has_more} is not
	 * 1, when the response carries no {@code next} cursor, or when the cursor
	 * does not advance (which would otherwise request the same page forever).
	 *
	 * @param sqlSession session used for the inserts
	 * @param authorId   id stored with every article as its author
	 * @param authorUrl  author home page url; the media id is derived from it
	 * @param hotTime    value sent as {@code max_behot_time} for the first page
	 */
	private static void connectSubPage(SqlSession sqlSession,String authorId,String authorUrl,long hotTime){
		String mediaId = authorUrl.replace("http://toutiao.com/m", "").replace("/", "");
		long cursor = hotTime;
		while(true){
			JSONObject json = getSubPage(mediaId, cursor);
			if(json==null||json.getIntValue("has_more")!=1){
				return;
			}
			JSONArray data = json.getJSONArray("data");
			int size = data == null ? 0 : data.size();
			for(int i=0;i<size;i++){
				try {
					saveArticle(sqlSession, authorId, data.getJSONObject(i));
				} catch (Exception e) {
					// One bad article must not abort the rest of the page.
					System.out.println("error:"+e.getMessage());
				}
			}
			JSONObject next = json.getJSONObject("next");
			if(next==null){
				return;
			}
			long nextCursor = next.getLongValue("max_behot_time");
			if(nextCursor==cursor){
				return;
			}
			cursor = nextCursor;
		}
	}
	
	/**
	 * Stores one article if its read count is at least 10000.
	 *
	 * <p>For plain articles with a source url the article page is visited to
	 * obtain section and tags; videos and galleries get a fixed section.
	 *
	 * @param sqlSession session used for the insert
	 * @param authorId   id stored as the article's author
	 * @param obj        one element of the article list's {@code data} array
	 */
	private static void saveArticle(SqlSession sqlSession,String authorId,JSONObject obj){
		String readCount = normalizeCount(obj.getString("go_detail_count"));// read count
		if(readCount==null){
			return;
		}
		boolean popular = Double.parseDouble(readCount) >= 10000;// only keep articles read 10000+ times
		if(!popular){
			return;
		}
		String title = obj.getString("title");// title
		JSONArray images = obj.getJSONArray("image_list");
		int imageCount = images==null ? 0 : images.size();// number of preview images
		int type = Constants.ARTICLE;// article type
		if(obj.getBooleanValue("has_video")){
			type = Constants.VIDEO;
		}else if(obj.getBooleanValue("has_gallery")){
			type = Constants.GALLERY;
		}
		String commentCount = normalizeCount(obj.getString("comments_count"));// comment count
		Date publishTime = parsePublishTime(obj);// publish time
		int galleryCount = obj.getIntValue("gallery_pic_count");// number of pictures in the gallery
		String articleUrl = obj.getString("source_url");
		String section = null;
		if(type == Constants.VIDEO){
			section = "视频";
		}
		if(type == Constants.GALLERY){
			section = "图片";
		}
		String tag = null;
		if(type==Constants.ARTICLE&&StringUtils.isNotEmpty(articleUrl)){
			// Visit the article page for the remaining information.
			Map<String, Object> articleInfo = getArticleInfo(articleUrl);
			if(articleInfo!=null){
				section = articleInfo.get("section")==null?"":articleInfo.get("section").toString();
				tag = articleInfo.get("tag")==null?"":articleInfo.get("tag").toString();
			}
		}
		ArticleMapper mapper = sqlSession.getMapper(ArticleMapper.class);
		Map<String, Object> articleMap = new HashMap<String, Object>();
		articleMap.put("title",title);
		articleMap.put("readcount",readCount);
		articleMap.put("showimgcount",imageCount);
		articleMap.put("type",type);
		articleMap.put("commentcount",commentCount);
		articleMap.put("publishtime",publishTime);
		articleMap.put("gallerycount",galleryCount);
		articleMap.put("section",section);
		articleMap.put("tagword",tag);
		articleMap.put("author",authorId);
		mapper.insertArticle(articleMap);
	}
	
	/**
	 * Expands a count written with the "万" (x10000) suffix into a plain number.
	 *
	 * <p>The product is rounded to a whole number so that floating-point noise
	 * (for example 1.1 * 10000 = 11000.000000000002) never reaches the database.
	 * Values without the suffix are returned unchanged.
	 *
	 * @param raw count text as delivered by the API, may be {@code null}
	 * @return the count as text, or {@code null} if {@code raw} is {@code null}
	 * @throws NumberFormatException if the part before "万" is not a number
	 */
	private static String normalizeCount(String raw){
		if(raw==null||!raw.contains("万")){
			return raw;
		}
		double tenThousands = Double.parseDouble(raw.replace("万", "").trim());
		return String.valueOf(Math.round(tenThousands*10000));
	}
	
	/**
	 * Reads the article's {@code datetime} field as a date.
	 *
	 * <p>The value is first parsed as {@code yyyy-MM-dd HH:mm}; if that fails,
	 * the JSON library's own date conversion is tried on the same field.
	 *
	 * @return the publish time, or {@code null} if the field is missing or
	 *         cannot be converted
	 */
	private static Date parsePublishTime(JSONObject obj){
		String raw = obj.getString("datetime");
		if(raw==null){
			return null;
		}
		try {
			return sdf.parse(raw);
		} catch (ParseException e) {
			try {
				return obj.getDate("datetime");
			} catch (RuntimeException ignored) {
				// Neither format matched: store the article without a publish time
				// instead of dropping it.
				return null;
			}
		}
	}
	
	/**
	 * Requests one page of an author's article list.
	 *
	 * @param media_id author media id
	 * @param hotTime  value sent as {@code max_behot_time}
	 * @return the parsed JSON object, or {@code null} if the request or the
	 *         parsing failed (the failure is logged to stderr)
	 */
	public static JSONObject getSubPage(String media_id,long hotTime){
		String url = TOUTIAOHAOURL + "&media_id="+media_id+"&max_behot_time="+hotTime;
		JSONObject param = CommonUtils.getUrlParam();
		url+= "&as=" + param.get("as") + "&cp=" + param.get("cp");
		try {
			return fetchJson(url);
		} catch (Exception e) {
			System.err.println("访问子页面失败:"+url);
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Loads an article page and extracts its section and tags.
	 *
	 * <p>The section is the text of the second link inside the first
	 * {@code chinese-tag} element of {@code #header}; the tags are the link
	 * texts of the first {@code label-list} element, each wrapped in
	 * parentheses. Parts that are missing from the page yield empty strings.
	 *
	 * @param url article url; a {@code /item/<id>} path is rewritten to
	 *            {@code /i<id>} before the request
	 * @return a map with the keys {@code "section"} and {@code "tag"}, or
	 *         {@code null} if {@code url} is {@code null} or the page could not
	 *         be loaded
	 */
	public static Map<String, Object> getArticleInfo(String url){
		if(url==null){
			return null;
		}
		// Only the "/item/" path segment is rewritten, not every "tem/" in the url.
		url = url.replace("/item/", "/i");
		try {
			Connection connect = Jsoup.connect(url);
			Document document = connect.get();
			String section = "";
			Element header = document.getElementById("header");
			if(header!=null){
				Elements divs = header.getElementsByClass("chinese-tag");
				if(divs!=null&&divs.size()>0){
					Elements links = divs.get(0).getElementsByTag("a");
					if(links!=null&&links.size()>1){
						section = links.get(1).text();
					}
				}
			}
			StringBuilder tag = new StringBuilder();
			Elements tagElements = document.getElementsByClass("label-list");
			if(tagElements!=null&&tagElements.size()>0){
				for (Element t : tagElements.get(0).getElementsByTag("a")) {
					tag.append("(").append(t.text()).append(")");
				}
			}
			Map<String, Object> map = new HashMap<String, Object>();
			map.put("section", section);
			map.put("tag", tag.toString());
			return map;
		} catch (IOException e) {
			System.err.println("访问文章页失败:"+url+"  原因"+e.getMessage());
		}
		return null;
	}
}
